In this vignette, we present a practical example of using the sunflower package to work with datasets that include a column of responses containing multiple answers. We demonstrate how to convert the dataset into a long format to obtain formal similarity metrics. Additionally, we illustrate how to perform error classification based on classical criteria found in the literature (e.g., Dell et al., 1997; Gold & Kertesz, 2001; see also, García-Orza et al., 2020).

0. Dependencies

# Dependency loading. NOTE(review): library() is generally preferred over
# require() for hard dependencies (require() returns FALSE instead of
# erroring), but the calls are kept as-is so the captured output below
# matches what was actually run.
require(sunflower) # the package this vignette demonstrates
#> Loading required package: sunflower
require(tidyverse) # data wrangling (dplyr, tidyr, stringr, ...) used alongside sunflower
#> Loading required package: tidyverse
#> Warning: package 'tidyverse' was built under R version 4.2.3
#> Warning: package 'ggplot2' was built under R version 4.2.3
#> Warning: package 'tibble' was built under R version 4.2.3
#> Warning: package 'tidyr' was built under R version 4.2.3
#> Warning: package 'readr' was built under R version 4.2.3
#> Warning: package 'purrr' was built under R version 4.2.3
#> Warning: package 'dplyr' was built under R version 4.2.3
#> Warning: package 'stringr' was built under R version 4.2.3
#> Warning: package 'forcats' was built under R version 4.2.3
#> Warning: package 'lubridate' was built under R version 4.2.3
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.4     ✔ readr     2.1.5
#> ✔ forcats   1.0.0     ✔ stringr   1.5.1
#> ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
#> ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
#> ✔ purrr     1.0.2
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
require(knitr) # R Markdown rendering support
#> Loading required package: knitr
#> Warning: package 'knitr' was built under R version 4.2.3
require(kableExtra) # table formatting in R Markdown
#> Loading required package: kableExtra
#> Warning: package 'kableExtra' was built under R version 4.2.3
#> 
#> Attaching package: 'kableExtra'
#> 
#> The following object is masked from 'package:dplyr':
#> 
#>     group_rows
require(rmarkdown) # used below for paged_table() previews
#> Loading required package: rmarkdown
#> Warning: package 'rmarkdown' was built under R version 4.2.3
require(ggplot2) # plotting (already attached by tidyverse; harmless here)

1. Managing multiple responses

Load the IGC dataset used throughout this vignette, select the columns to keep, and visualize it.


# Read the raw data, keep the columns of interest, restrict to the target
# tests while excluding nonword tasks, and sort by participant ID.
IGC <- readxl::read_xlsx("data/1-Gutiérrez-Cordero_data_RAW.xlsx") %>%
  dplyr::select(test, task_ID, task_type, ID,
                item_ID = task_item_ID, item, final_response, correct) %>%
  dplyr::filter(
    test %in% c("SnodgrassVanderwart", "BETA", "EPLA", "Gutiérrez-Cordero"),
    !str_detect(task_type, "nonword")
  ) %>%
  dplyr::arrange(ID)

View(IGC)

# Preview the key columns in a paged table.
IGC %>%
  dplyr::select(item_ID, item, final_response) %>%
  rmarkdown::paged_table(options = list(rows.print = 8, align = "ccc"))

  # Turn the first rows of the data frame into a table grob so it can be
  # embedded in a ggplot and exported with ggsave().
  table_plot <- IGC %>%
    dplyr::select(ID, item, final_response) %>%
    dplyr::slice_head(n = 11) %>%
    gridExtra::tableGrob()

  # Empty canvas (no axes, no margins) carrying the table as an annotation.
  plot <- ggplot() +
    theme_void() +
    annotation_custom(table_plot)

  # Display the figure
  plot

  # Export in raster and vector formats.
  ggsave("artwork/figure1.png", plot, width = 5.5, height = 3.75, dpi = 600)
  ggsave("artwork/figure1.svg", plot, width = 5.5, height = 3.75, dpi = 600)

Separate the data using the separate_responses() function and then rearrange it into long format using get_attempts() to work in the following step.


# Split multi-response cells into one row per response, then number each
# production attempt per item.
# FIX: use TRUE instead of T (T is an ordinary variable that can be
# reassigned) and <- for assignment.
IGC_step1 <- IGC %>%
  separate_responses(col_name = "final_response",
                     separate_with = ", ") %>%
  get_attempts(first_production = Attempt_1, drop_blank_spaces = TRUE)

# Slim display version: task_type is not needed in the preview.
IGC_step1_skinnydf <- IGC_step1 %>% dplyr::select(-task_type)

rmarkdown::paged_table(IGC_step1_skinnydf, options = list(rows.print = 25, align = "ccc"))
  
  # Turn the long-format data into a table grob for export: keep the key
  # columns, rename them for display, and take the first 15 rows.
  table_plot2 <- IGC_step1_skinnydf %>%
    dplyr::select(ID, item, attempt = Attempt, response = Response) %>%
    dplyr::slice_head(n = 15) %>%
    gridExtra::tableGrob()

  # Empty canvas (no axes, no margins) carrying the table as an annotation.
  plot2 <- ggplot2::ggplot() +
    ggplot2::theme_void() +
    ggplot2::annotation_custom(table_plot2)

  # Display the figure
  plot2

  # Export in raster and vector formats.
  ggsave("artwork/figure2.png", plot2, width = 4.25, height = 4.75, dpi = 600)
  ggsave("artwork/figure2.svg", plot2, width = 4.25, height = 4.75, dpi = 600)

2. Formal Analysis

Compute the similarity metrics using the get_formal_similarity() function.


# Compute string-based (formal) similarity metrics between each target
# item and each response attempt.
IGC_step2 <- IGC_step1 %>% get_formal_similarity(item_col = "item",
                                                 response_col = "Response",
                                                 attempt_col = "Attempt",
                                                 group_cols = c("ID", "task_ID"))
#> The function get_formal_similarity() took 3.73 seconds to be executed

# Define function words (articles, etc.) to exclude from the preview
terms_to_remove <- c("el", "la", "un", "una", "vaya", 
                     "los", "las", "unos", "unas", "no")

# Build a whole-word regex matching any of the terms
pattern <- paste0("\\b(", paste(terms_to_remove, collapse = "|"), ")\\b")

# Keep only rows whose 'Response' contains none of the terms, then drop
# task_type for display.
# FIX: the original code built the filtered data frame and then immediately
# reassigned IGC_step2_skinnydf from the unfiltered IGC_step2, silently
# discarding the filter. The two steps are now a single pipeline.
IGC_step2_skinnydf <- IGC_step2 %>%
  dplyr::filter(!str_detect(Response, pattern)) %>%
  dplyr::select(-task_type)

rmarkdown::paged_table(IGC_step2_skinnydf, options = list(rows.print = 25, align = "c"))


  # Build a display-ready slice of the similarity table: drop helper
  # columns, round numeric metrics to 3 decimals, rename for readability,
  # and keep the first 15 rows.
  table_plot3 <- IGC_step2_skinnydf %>%
    dplyr::select(-c(responseL, targetL)) %>%
    dplyr::mutate(across(where(is.numeric), ~ round(., 3))) %>%
    dplyr::select(-comment_warning) %>%
    dplyr::rename(diff_chars = diff_char_num,
                  attempt = Attempt,
                  response = Response) %>%
    dplyr::slice_head(n = 15) %>%
    gridExtra::tableGrob()

  # Empty canvas (no axes, no margins) carrying the table as an annotation.
  plot3 <- ggplot() +
    theme_void() +
    annotation_custom(table_plot3)

  # Display the figure
  plot3

  # Export in raster and vector formats.
  ggsave("artwork/figure4.png", plot3, width = 21, height = 5, dpi = 600)
  ggsave("artwork/figure4.svg", plot3, width = 21, height = 5, dpi = 600)

2.1. Positional Analysis

Obtain the correct characters, in this case, letters, in their correct position using the positional_accuracy() function.


# Score character-by-character positional accuracy: which characters of
# each response sit in their correct position relative to the target.
# NOTE(review): the "New names" messages below suggest unnamed columns are
# created internally — presumably one per character position up to the
# longest target; confirm against positional_accuracy()'s documentation.
IGC_step2.1 = IGC_step2 %>% positional_accuracy(match_col = "itemL_adj_strict_match_pos",
                                          last_ID_col = "targetL")
#> New names:
#> • `` -> `...1`
#> • `` -> `...2`
#> • `` -> `...3`
#> • `` -> `...4`
#> • `` -> `...5`
#> • `` -> `...6`
#> • `` -> `...7`
#> • `` -> `...8`
#> • `` -> `...9`
#> • `` -> `...10`
#> • `` -> `...11`
#> • `` -> `...12`
#> • `` -> `...13`
#> • `` -> `...14`

# Slim display version: drop bookkeeping columns and use lower-case names.
IGC_step2.1_skinnydf = IGC_step2.1 %>% dplyr::select(-c(task_ID, correct, task_type)) %>%
            dplyr::rename(attempt = Attempt, response = Response, position = Position)

rmarkdown::paged_table(IGC_step2.1_skinnydf, options = list(rows.print = 25, align = "c"))

This is a file (generated following the procedure described by [Dueñas Lerín](https://duenaslerin.com/diccionario-palabras-espanol-en-texto-script/)) containing all the Spanish words available in the RAE dictionary. It can be downloaded from the author’s page at https://github.com/JorgeDuenasLerin/diccionario-espanol-txt.


m_w2v = word2vec::read.word2vec(file = file.choose(), normalize = F)

This is a file (generated using the word2vec algorithm by Cardellino) containing the embeddings of 1.5 billion words. It can be downloaded from the author’s page or from a mirror on GitHub, where other corpora can be accessed; further details are provided by the author here.


# Rebuild the long-format data from the raw IGC and recompute the formal
# similarity metrics, this time grouping by item_ID instead of task_ID.
# FIX: use TRUE instead of T (T is reassignable) and <- for assignment.
IGC_step2_clean <- IGC %>%
  separate_responses(
    col_name = "final_response",
    separate_with = ", ") %>%
  get_attempts(
    first_production = Attempt_1, drop_blank_spaces = TRUE) %>%
  dplyr::select(task_ID, ID, item_ID, task_type, item, Response, RA, Attempt) %>%
  get_formal_similarity(item_col = "item", response_col = "Response",
                        attempt_col = "Attempt",
                        group_cols = c("ID", "item_ID"))
#> The function get_formal_similarity() took 2.70 seconds to be executed

IGC_step2clean_skinnydf <- IGC_step2_clean %>% dplyr::select(-c(task_ID, task_type))

rmarkdown::paged_table(IGC_step2clean_skinnydf, options = list(rows.print = 25, align = "c"))

# remove some values leaving NAs to check that the functions work correctly

IGC_step2_cleanNA = IGC_step2_clean %>%
  dplyr::mutate(
    # Blank out one Response and one item value to exercise NA handling.
    Response = dplyr::if_else(dplyr::row_number() == 2, NA_character_, Response),
    item = dplyr::if_else(dplyr::row_number() == 3, NA_character_, item)
  )

# Run the full pipeline: lexicality check against a word database, formal
# (string) similarity, and semantic similarity via the word2vec model
# loaded above.
IGC_step3 <- IGC_step2_cleanNA %>%
  check_lexicality(item_col = "item", response_col = "Response", criterion = "database") %>%
  get_formal_similarity(item_col = "item", response_col = "Response",
                          attempt_col = "Attempt",
                          group_cols = c("ID", "item_ID")) %>%
  get_semantic_similarity(item_col = "item", response_col = "Response", model = m_w2v)
#> The function check_lexicality() took 4.67 seconds to be executed
#> The function get_formal_similarity() took 6.78 seconds to be executed
#> The function get_semantic_similarity() took 7.95 seconds to be executed

# Compute accessed col (1 = response exactly matches the target item).
# NOTE(review): Response == item yields NA (not 0) when either side is NA,
# so 'accessed' is NA for the rows blanked out above — confirm this is the
# intended input for classify_errors() downstream.

IGC_step3 = IGC_step3 %>% 
    dplyr::mutate(accessed = dplyr::if_else(Response == item, 1, 0))

IGC_step3_skinnydf = IGC_step3 %>% dplyr::select(-c(item_ID, task_type))

rmarkdown::paged_table(IGC_step3_skinnydf, options = list(rows.print = 25, align = "c"))

Proceed with the error classification.


# Classify each response into classical error categories; repeated-approach
# (RA) attempts are classified as well. A sequential general_ID is added so
# individual rows can be referenced below.
# FIX: use TRUE instead of T (T is reassignable).
IGC_step4 <- IGC_step3 %>% classify_errors(access_col = "accessed", 
                                           RA_col = "RA",
                                           response_col = "Response", 
                                           item_col = "item",
                                           also_classify_RAs = TRUE) %>%
  dplyr::mutate(general_ID = dplyr::row_number())
#> The function classify_errors() took 0.01 seconds to be executed

View(IGC_step4)

# Hand-picked rows, one example of each error category, for display.
# FIX: dropped the no-op rename item_ID = item_ID.
IGC_step4_print_skinny <- IGC_step4 %>%
  dplyr::filter(general_ID %in% c(8, 156, 13, 3284, 222, 3448, 5658)) %>%
  dplyr::select(general_ID, ID, item_ID, item, Response, RA, 
                Attempt, lexicality, cosine_similarity, nonword:no_response, comment) %>% 
  dplyr::rename(w2v_cos = cosine_similarity,
                attempt = Attempt, response = Response)

rmarkdown::paged_table(IGC_step4_print_skinny, options = list(rows.print = 15, align = "c"))


  # Turn the first rows of the classified examples into a table grob so it
  # can be embedded in a ggplot and exported with ggsave().
  table_plot4 <- IGC_step4_print_skinny %>%
    dplyr::slice_head(n = 7) %>%
    gridExtra::tableGrob()

  # Empty canvas (no axes, no margins) carrying the table as an annotation.
  plot4 <- ggplot() +
    theme_void() +
    annotation_custom(table_plot4)

  # Display the figure
  plot4

  # Export in raster and vector formats.
  ggsave("artwork/figure6.png", plot4, width = 15, height = 3.25, dpi = 600)
  ggsave("artwork/figure6.svg", plot4, width = 15, height = 3.25, dpi = 600)